#
#  $Id: doc_compress.py,v 1.1 2001/03/29 21:15:44 rob Exp $
#
#  Copyright 1999-2001 Rob Tillotson <rob@pyrite.org>
#  All Rights Reserved
#
#  Permission to use, copy, modify, and distribute this software and
#  its documentation for any purpose and without fee or royalty is
#  hereby granted, provided that the above copyright notice appear in
#  all copies and that both the copyright notice and this permission
#  notice appear in supporting documentation or portions thereof,
#  including modifications, that you you make.
#
#  THE AUTHOR ROB TILLOTSON DISCLAIMS ALL WARRANTIES WITH REGARD TO
#  THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
#  AND FITNESS.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
#  SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
#  RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
#  CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
#  CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE!
#
"""Doc compression in pure Python.
"""

__version__ = '$Id: doc_compress.py,v 1.1 2001/03/29 21:15:44 rob Exp $'

__copyright__ = 'Copyright 1999-2001 Rob Tillotson <rob@pyrite.org>'


import string

COUNT_BITS = 3
DISP_BITS = 11


def compress(s):
	# optimizations
	# this cut off about 0.1 sec/call
	_find = string.find

	out = []
	space = 0
	
	# first phase: sliding window
	sstart = 0
	i = 0
	imax = len(s)
	while 1:
		if i >= imax: break
		if (i - sstart) > 2047: sstart = i - 2047
		
		#ts = s[i:i+10] # substring to search for
		#while len(ts) >= 3:
		#	f = _find(s, ts, sstart, i)
		#	if f >= 0: break
		#	ts = ts[:-1]

		# see that code above?  it's the obvious way, but it's slow.
		# what it does is basically do string.find on the data ahead
		# of the current position, on an ever-shrinking buffer.  With
		# that code, this function took around 1.6 seconds per call
		# (on a Cyrix M2-266 [207 MHz/83 MHz bus/1MB cache]).
		#
		# The below is a bit different: basically, it takes advantage
		# of the fact that if we don't have a length 3 substring, we
		# can't possibly have a length 4 substring, and so on.  Thus,
		# it first looks for a length 3 substring (we don't care about
		# anything shorter).  If it finds one, it then attempts to
		# find a length 4 substring between that position and the
		# current location, and so on.
		#
		# I suspect the primary benefit of this is to avoid 6 out of 7
		# calls to string.find every time we aren't looking at a
		# compressible string.  At any rate, changing this code cuts
		# the time to about 0.38 seconds per call.
		e = 3
		ns = sstart
		ts = s[i:i+e]
		while (imax - i) >= e and e <= 10:
			f = _find(s, ts, ns, i)
			if f < 0: break
			e = e + 1
			ts = s[i:i+e]
			ns = f
		e = e - 1

		#if len(ts) >= 3: #we have a match, f is the location and len(ts) is the length
		#	l = len(ts)
		#	ns = f
		if e >= 3:
			dist = i - ns
			byte = (dist << 3) | (e-3)
			if space:
				out.append(32)
				space = 0
			out.append(0x80 | (byte >> 8))
			out.append(byte & 0xff)
			i = i + e
			
		else:
			c = ord(s[i])
			i = i + 1
			if space:
				if c >= 0x40 and c <= 0x7f: out.append(c | 0x80)
				else:
					out.append(32)
					if c < 0x80 and (c == 0 or c > 8):
						out.append(c)
					else:
						out.append(1)
						out.append(c)
				space = 0
			else:
				if c == 32: space = 1
				else:
					if c < 0x80 and (c == 0 or c > 8):
						out.append(c)
					else:
						out.append(1)
						out.append(c)
		
	if space: out.append(32)
	# second phase: look for repetitions of '1 <x>' and combine up to 8 of them.
	# interestingly enough, in regular text this hardly makes a difference.
	return string.join(list(map(chr, out)),'')
	

def uncompress(s):
	s = list(map(ord, s))
	x = 0
	o = []
	try:
		while 1:
			c = s[x]
			x = x + 1
			if c > 0 and c < 9:  # just copy that many bytes
				for y in range(0, c):
					o.append(s[x])
					x = x + 1
			elif c < 128: # a regular ascii character
				o.append(c)
			elif c >= 0xc0: # a regular ascii character with a space before it
				o.append(32)
				o.append(c & 0x7f)
			else: # a compressed sequence
				c = c << 8
				c = c | s[x]
				x = x + 1
				m = (c & 0x3fff) >> COUNT_BITS
				n = (c & ((1 << COUNT_BITS)-1)) + 3
				for y in range(0, n):
					o.append(o[len(o)-m])
	except IndexError:
		pass
	return string.join(list(map(chr, o)), '')


